Preliminaries

Load libraries

If you get an error go to the packages panel, click install and type the name of the library (you only need to do this once).

library(MLTools)
library(fpp2)
library(ggplot2)
library(readxl)
library(lmtest)  #contains coeftest function
library(tseries) #contains adf.test function
library(tidyverse)
library(TSstudio)

## Set working directory ---------------------------------------------------------------------------------------------
# Make all relative paths (e.g. the .xls file below) resolve next to this script.
# NOTE(review): rstudioapi::getActiveDocumentContext() only works when the file
# is open in RStudio; running via Rscript or knitting elsewhere will fail here.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

Car Registration Example

Load dataset

fdata <- read_excel("CarRegistrations.xls")

Visualize the first rows:

head(fdata)
## # A tibble: 6 × 2
##   Fecha               CarReg
##   <dttm>               <dbl>
## 1 1960-01-01 00:00:00   3.36
## 2 1960-02-01 00:00:00   4.38
## 3 1960-03-01 00:00:00   4.06
## 4 1960-04-01 00:00:00   5.38
## 5 1960-05-01 00:00:00   6.31
## 6 1960-06-01 00:00:00   4.02

Convert character columns to R date type

# Drop the time-of-day part: convert the POSIXct column to plain Date.
# NOTE(review): `format` is only consulted when the input is character; the
# echoed output above shows Fecha is <dttm> (POSIXct), so `format` appears to
# be ignored here — harmless, but worth confirming/removing.
fdata$Fecha <- as.Date(fdata$Fecha, format = "%d/%m/%Y")
head(fdata)
## # A tibble: 6 × 2
##   Fecha      CarReg
##   <date>      <dbl>
## 1 1960-01-01   3.36
## 2 1960-02-01   4.38
## 3 1960-03-01   4.06
## 4 1960-04-01   5.38
## 5 1960-05-01   6.31
## 6 1960-06-01   4.02

Order the table by date (using the pipe operator)

fdata <- fdata %>% arrange(Fecha)

# same as: fdata <- arrange(fdata, Fecha)

Check for missing dates (time gaps)

range(fdata$Fecha)
## [1] "1960-01-01" "1999-12-01"

Create a complete sequence of months with the same range and compare it to the dates in our data.

date_range <- seq.Date(min(fdata$Fecha), max(fdata$Fecha), by = "months")

Now we do the comparison

date_range[!date_range %in% fdata$Fecha] 
## Date of length 0

We also check for duplicates

# A duplicated date would mean two rows for the same month; which() applied to
# the duplicated() mask returns the offending row indices (empty here).
which(duplicated(fdata$Fecha))
## integer(0)

Check for missing data (NA) in time series value columns

sum(is.na(fdata$CarReg))
## [1] 0

Convert it to a time series object

This is monthly data, so the frequency is 12. The series starts in January. If it started in e.g. April we would use start = c(1960, 4)

# Monthly data -> 12 observations per seasonal cycle; series begins Jan 1960.
freq <- 12
start_date  <-  c(1960, 1)
# Build the ts object from the value column; dates are implied by start + frequency.
y <- ts(fdata$CarReg, start = start_date, frequency = freq)

Time plot:

autoplot(y) +
  ggtitle("Car registrations") +
  xlab("Year") + ylab("Number registered (thousands)")

Train/test split

n <- length(y)

Leave 5 years for validation

n_test <- floor(5 * freq)

y_split <- ts_split(y, sample.out = n_test)
y.TR <- y_split$train
y.TV <- y_split$test

Alternatively, with subset

# Equivalent split using subset(). Reuse n_test (defined above as 5 * freq)
# instead of re-hardcoding 5 * 12, so the two split approaches cannot drift
# apart if the validation horizon is changed.
y.TR <- subset(y, end = length(y) - n_test)
y.TV <- subset(y, start = length(y) - n_test + 1)

And let us visualize the split.

autoplot(y.TR, color = "orange") + 
  autolayer(y.TV, color = "blue")

Identification and fitting process

Box-Cox transformation

Lambda <- BoxCox.lambda.plot(y.TR, window.width = 12)
## `geom_smooth()` using formula = 'y ~ x'

z <- BoxCox(y.TR, Lambda)
p1 <- autoplot(z)
p2 <- autoplot(y)
gridExtra::grid.arrange(p1, p2, nrow = 2 )

Differentiation

Recall: if the ACF decreases very slowly, the series needs differencing.

ggtsdisplay(z,lag.max = 100)

Seasonal Differentiation

It is generally better to start with seasonal differencing when a time series exhibits both seasonal and non-seasonal patterns.

B12z<- diff(z, lag = freq, differences = 1)

We use the ACF and PACF to inspect the result

ggtsdisplay(B12z,lag.max = 4 * freq) 

Regular Differentiation (without seasonal)

Bz <- diff(z,differences = 1)
ggtsdisplay(Bz, lag.max = 4 * freq) 

Both regular & seasonal Differentiation

B12Bz <- diff(Bz, lag = freq, differences = 1)
ggtsdisplay(B12Bz, lag.max = 4 * freq) 

Remember, when you apply both differences the order does not matter. Here we check that visually:

B_B12z<- diff(B12z, differences = 1)
autoplot(B12Bz, color = "blue", size = 2) + autolayer(B_B12z, color = "orange", size = 0.5)

Model Order (p, d, q)(P, D, Q)_s selection

Now, using the results above, we select both the regular d and the seasonal D orders of differencing and the ARMA structure, both regular and seasonal

p <- 0
d <- 1
q <- 1

P <- 1
D <- 1
Q <- 1

Fit seasonal model with estimated order

# Fit the seasonal ARIMA(p,d,q)(P,D,Q)[12] model on the training set only.
# - lambda: Arima applies the Box-Cox transform internally and back-transforms
#   its forecasts automatically.
# - include.constant = FALSE: no drift/constant term (the series is already
#   differenced via d = D = 1).
arima.fit <- Arima(y.TR,
                   order=c(p, d, q),
                   seasonal = list(order=c(P, D, Q), period=freq),
                   lambda = Lambda,
                   include.constant = FALSE)

Model Diagnosis

Summary of training errors and estimated coefficients

summary(arima.fit) 
## Series: y.TR 
## ARIMA(0,1,1)(1,1,1)[12] 
## Box Cox transformation: lambda= -0.02149828 
## 
## Coefficients:
##           ma1    sar1     sma1
##       -0.5936  0.2317  -0.9189
## s.e.   0.0435  0.0611   0.0394
## 
## sigma^2 = 0.01329:  log likelihood = 294.41
## AIC=-580.82   AICc=-580.72   BIC=-564.78
## 
## Training set error measures:
##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.2509503 5.757222 3.958458 -1.724802 9.275896 0.5753384
##                     ACF1
## Training set -0.04836602

Statistical significance of estimated coefficients

coeftest(arima.fit) 
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value  Pr(>|z|)    
## ma1  -0.593623   0.043517 -13.6413 < 2.2e-16 ***
## sar1  0.231663   0.061055   3.7943  0.000148 ***
## sma1 -0.918918   0.039365 -23.3437 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Root plot

autoplot(arima.fit)

Check residuals

CheckResiduals.ICAI(arima.fit, bins = 100, lag=100)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)(1,1,1)[12]
## Q* = 80.597, df = 97, p-value = 0.8855
## 
## Model df: 3.   Total lags used: 100
# If residuals are not white noise, change order of ARMA
ggtsdisplay(residuals(arima.fit), lag.max = 100)

Check the fitted values

autoplot(y.TR, series = "Real") +
  forecast::autolayer(arima.fit$fitted, series = "Fitted")

Future forecast and validation

y_est <- forecast::forecast(y.TR, model=arima.fit, h=freq)
head(y_est$mean, n = 12)
##            Jan       Feb       Mar       Apr       May       Jun       Jul
## 1995  73.04602  79.74579 100.42090  90.57890  98.39497 101.59996 113.57741
##            Aug       Sep       Oct       Nov       Dec
## 1995  70.23188  70.47925  87.31780  83.85101  94.95915
xmin <- 1995
xmax <- 1996
ymin <- 0
ymax <- Inf

autoplot(subset(y, start=length(y.TR) - 30, end=length(y.TR) + freq)) + 
  autolayer(y_est, alpha = 0.5) +
  annotate("rect", xmin=xmin, xmax=xmax, ymin=ymin, ymax= ymax, alpha=0.2, fill="orange") 

Validation error for h = 1

Obtain the forecast in validation for horizon = 1 using the trained parameters of the model. We loop through the validation period, adding one point of the time series each time and forecasting the next value (h = 1).

# Rolling one-step-ahead forecast over the validation period.
# Passing `model = arima.fit` re-uses the coefficients estimated on the
# training set (no re-estimation); only the data window passed to forecast
# grows by one observation per iteration.
y.TV.est <- y * NA # same ts structure/index as y, values to be filled in
for (i in seq(length(y.TR) + 1, length(y), 1)){
  y.TV.est[i] <- forecast::forecast(subset(y, end=i-1), # all data up to i-1
                          model = arima.fit,       # fixed, pre-fitted model
                          h=1)$mean                # one step ahead
}

Next we plot the series and both forecasts. We limit the values depicted in the plot to improve the visualization. As you can see the loop forecasts (with h = 1) appear to be better than the h=12 forecasts in y_est.

autoplot(subset(y, start=length(y.TR) - 24, end = length(y.TR) + 12)) +
  forecast::autolayer(y_est$mean, color="blue") + 
  forecast::autolayer(subset(y.TV.est, start = length(y.TR) + 1, 
                             end = length(y.TR) + 12), color="red")

# autoplot(subset(y, start=length(y.TR) - 24)) +
#   forecast::autolayer(y_est$mean, color="blue") + 
#   forecast::autolayer(subset(y.TV.est, start=length(y.TR) + 1), color="red")

Finally we compute the validation errors

accuracy(subset(y.TV.est, start = length(y.TR) + 1), y)
##                ME     RMSE      MAE       MPE   MAPE       ACF1 Theil's U
## Test set 1.147887 8.781182 6.889705 -0.044289 7.4906 -0.2271263 0.3753469

Uncomment the following line to see the direct computation of RSME

#  sqrt(mean((y.TV.est - y.TV)^2))

Additional Code

We have seen above that the result of using forecast is different from the result of the validation loop. Both are using the fitted model to forecast, so what is the difference?

Below we will explicitly use the fitted model coefficients to obtain the forecasted values, both for the validation loop and for the forecast function. Keep in mind that the Arima function incorporates the BoxCox transformation. So to make things simpler, we will work directly with z instead of y.

Recall that the equation of our model is: \[ (1 - \Phi_1B^{12})(1 - B)(1 - B^{12})z_t = (1 - \theta_1 B)(1 - \Theta_1 B^{12})\epsilon_t \] But we can expand this equation and reorganize it into a forecasting equation: \[ z_t = z_{t-1} + z_{t-12} - z_{t-13} + \Phi_s (z_{t-12} - z_{t-13} - z_{t-24} + z_{t-25}) + \\ \epsilon_t + \theta \epsilon_{t-1} + \Theta_s \epsilon_{t-12} + \theta \Theta_s \epsilon_{t-13} \] Similarly we can do the same with the error term to get \[ \epsilon_t = z_t - z_{t-1} - z_{t-12} + z_{t-13} - \Phi_s (z_{t-12} - z_{t-13} - z_{t-24} + z_{t-25}) - \\ \theta \epsilon_{t-1} - \Theta_s \epsilon_{t-12} - \theta \Theta_s \epsilon_{t-13} \] The right hand side of these equations only contains lags of \(z_t\) and \(\epsilon_t\). Section 9.8 of Hyndman and Athanasopoulos (2021) explains how to apply these forecasting equations. In summary: when you start forecasting the first values you replace past values of \(z\) with training set values and \(\epsilon\) values with the residuals of the fitted model (also corresponding to training). As you move further into the test set, unknown values of \(z\) and \(\epsilon\) will be needed. The difference between forecast and the validation loop is in deciding what values we use for \(z\) and \(\epsilon\) for \(t\) values corresponding to the test set. Let us explore this in the code. So we begin by splitting z into training and test.
Technical note: Lambda was determined using only the training set, and we apply the same Lambda to the test set to prevent data leakage.

z <- BoxCox(y, Lambda)
z.TR <- subset(z, end = length(y.TR))
z.TV <- subset(z, start = length(y.TR) + 1)

We fit a seasonal model to z with the same order we used for y, but setting lambda equal to 1.

arima.fit <- Arima(z.TR, 
                   order=c(p, d, q),
                   seasonal = list(order=c(P, D, Q), period=12),
                   lambda = 1, 
                   include.constant = FALSE)

In the code below you can verify that the model fit is equivalent to what we obtained above for y.

summary(arima.fit) 
## Series: z.TR 
## ARIMA(0,1,1)(1,1,1)[12] 
## Box Cox transformation: lambda= 1 
## 
## Coefficients:
##           ma1    sar1     sma1
##       -0.5936  0.2317  -0.9189
## s.e.   0.0435  0.0611   0.0394
## 
## sigma^2 = 0.01329:  log likelihood = 294.41
## AIC=-580.82   AICc=-580.72   BIC=-564.78
## 
## Training set error measures:
##                        ME      RMSE        MAE        MPE     MAPE      MASE
## Training set -0.008851714 0.1130747 0.08373548 -0.3647752 2.684318 0.5294322
##                    ACF1
## Training set 0.02378263
coeftest(arima.fit) 
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value  Pr(>|z|)    
## ma1  -0.593623   0.043517 -13.6413 < 2.2e-16 ***
## sar1  0.231663   0.061055   3.7943  0.000148 ***
## sma1 -0.918918   0.039365 -23.3437 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
autoplot(arima.fit) 

CheckResiduals.ICAI(arima.fit, bins = 100, lag=100)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)(1,1,1)[12]
## Q* = 80.576, df = 97, p-value = 0.8858
## 
## Model df: 3.   Total lags used: 100

Now let us begin using this model to forecast the test set values.

We begin by using the forecast function to forecast all the values in the test set. That is, we set the forecasting horizon h equal to the length of the test set,

z_est <- forecast::forecast(arima.fit, h=length(z.TV))

Next we will generate three additional versions of the forecasts, using a loop as we did before.

  • The first forecast z.TV.est is exactly what we did before, but using z instead of y.
  • The second one z.TV.est2 will use the two forecasting equations (with the fitted coefficients) and will always use actual values of \(z_t\) in the test set. The error terms will also be updated with the second forecasting equation.
  • The third one z.TV.est3 will also use the forecasting equation, but it will use its own forecasted values of \(z_t\) in the test set. The error terms for \(t\) in the test set will all be set to zero. First we create empty time series to store the different forecasts.
z.TV.est <- z * NA
z.TV.est2 <- z * NA
z.TV.est3 <- z * NA

We make the training values of \(z_t\) available to the forecasting equation.

z.TV.est[1:length(z.TR)] <- z.TR
z.TV.est2[1:length(z.TR)] <- z.TR
z.TV.est3[1:length(z.TR)] <- z.TR

Similarly, we prepare two versions of \(\epsilon_t\) containing the training residuals, to be used by the second and third procedure. Even though they look initially the same, these error terms are updated differently: the second procedure really updates them with a forecast equation, whereas the third one leaves the test error values as 0.

w2 <- z * 0
w2[1:length(z.TR)] <- residuals(arima.fit)

w3 <- z * 0
w3[1:length(z.TR)] <- residuals(arima.fit)

We store the coefficients of the model (_s indicates seasonal)

Phi_s <- coefficients(arima.fit)["sar1"]

theta <- coefficients(arima.fit)["ma1"]
Theta_s <- coefficients(arima.fit)["sma1"]

And now we get the forecasts in a loop.

# Loop over the validation period computing the three forecast variants.
for (i in seq(length(z.TR) + 1, length(y), 1)) {

  # (1) Rolling one-step forecast with the fitted model, exactly as in the
  # validation loop above, but on the Box-Cox transformed series z.
  z.TV.est[i] <- forecast::forecast(subset(z, end = i - 1),
                                    model = arima.fit,
                                    h = 1)$mean

  # (2) Explicit forecasting equation fed with ACTUAL values of z, with the
  # error series w2 updated afterwards. At this point w2[i] is still 0
  # (initialized above), i.e. the unknown current innovation is set to its
  # expectation.
  z.TV.est2[i] <- z[i - 1] + z[i - 12] - z[i-13] + 
    Phi_s * (z[i-12] - z[i-13] - z[i - 24] + z[i-25]) + 
    w2[i] + theta * w2[i - 1] +  Theta_s * w2[i - 12] + theta * Theta_s * w2[i - 13]

  # Once the actual z[i] is known, recover the one-step innovation so later
  # iterations can use it. (Was `=`; use `<-` for assignment.)
  w2[i] <- z[i] - z[i-1] - z[i-12] + z[i-13] -
    Phi_s * (z[i-12] - z[i-13] - z[i-24] + z[i-25]) - 
    theta * w2[i-1] - Theta_s * w2[i-12] - theta * Theta_s * w2[i-13]

  # (3) Same forecasting equation but fed with its OWN previous forecasts,
  # and with the test-period errors left at 0 (w3 is never updated): this
  # mimics a multi-step forecast made from the end of the training set.
  z.TV.est3[i] <- z.TV.est3[i - 1] + z.TV.est3[i - 12] - z.TV.est3[i-13] + 
    Phi_s * (z.TV.est3[i-12] - z.TV.est3[i-13] - z.TV.est3[i - 24] + z.TV.est3[i-25]) + 
    w3[i] + theta * w3[i - 1] +  Theta_s * w3[i - 12] + theta * Theta_s * w3[i - 13]

}

Let us examine the results, comparing the first values for all the forecast procedures:

k <- 10

Using forecast function directly (we called this z_est above):

head(z_est$mean, k)
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 1995 4.099109 4.179055 4.388351 4.294830 4.369889 4.398921 4.499703 4.063269
##           Sep      Oct
## 1995 4.066478 4.261535

Using the validation loop as explained in today’s session

subset(z.TV.est, start = length(z.TR) + 1, end = length(z.TR) + k)
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 1995 4.099109 4.070260 4.258979 4.200732 4.243077 4.260375 4.427713 3.877607
##           Sep      Oct
## 1995 3.853199 4.023954

Using the two forecast equations with actual z values and error updates

subset(z.TV.est2, start = length(z.TR) + 1, end = length(z.TR) + k)
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 1995 4.099130 4.070274 4.258966 4.200749 4.243050 4.260350 4.427711 3.877596
##           Sep      Oct
## 1995 3.853198 4.023978

Using only the forecasting equation for y and no error update

subset(z.TV.est3, start = length(z.TR) + 1, end = length(z.TR) + k)
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 1995 4.099130 4.179070 4.388345 4.294849 4.369871 4.398893 4.499688 4.063245
##           Sep      Oct
## 1995 4.066459 4.261540

The results indicate that, up to small rounding errors there are only two different results:

  • the validation loop is doing what we called the second procedure.
  • the forecast function is doing what we called the third procedure.

The updated error values and the use of actual values of the time series explain why the validation loop produces more accurate values. This is illustrated in the plot below:

autoplot(subset(z, start = length(z.TR) - 30), series="Real") +
  forecast::autolayer(subset(z.TV.est, start = length(z.TR)), series="Forecasting Loop") +
  forecast::autolayer(subset(z.TV.est3, start = length(z.TR)), series="Forecast function")

Now we can also compare both approaches through their validation errors.

accuracy(z_est, z)
##                        ME      RMSE        MAE        MPE     MAPE      MASE
## Training set -0.008851714 0.1130747 0.08373548 -0.3647752 2.684318 0.5294322
## Test set     -0.082613246 0.1493240 0.12528694 -2.0788375 2.996520 0.7921486
##                    ACF1 Theil's U
## Training set 0.02378263        NA
## Test set     0.56407002 0.6401048
accuracy(z.TV.est, z)
##                    ME     RMSE         MAE         MPE      MAPE       ACF1
## Test set 0.0004523454 0.030467 0.008420289 0.003263154 0.1975652 -0.1771778
##          Theil's U
## Test set 0.1090504

As expected, direct use of the forecast function leads to worse predictive performance in the test set.

References

Hyndman, R. J., and G. Athanasopoulos. 2021. Forecasting: Principles and Practice. OTexts. https://otexts.com/fpp3/.